Figures in supplementary material

This document will allow you to reproduce all supplementary figures.

Set working directory and load all necessary libraries and functions.

# If not installed, install and load the package "here", else: only load the package.
err <- try(library("here", character.only = TRUE), silent = TRUE)
## here() starts at /Users/lgoeminn/Documents/phD/MSqRobHurdlePaper
if (class(err) == 'try-error') {
  install.packages("here", repos = "https://cloud.r-project.org")
  library("here", character.only = TRUE)
}

wd <- here()

# Optional: change the working directory
setwd(wd)

# Load all functions and libraries
source(paste0(wd, "/R/functions.r"))
## Loading required namespace: BiocManager
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: mzR
## Loading required package: Rcpp
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.0)
## than is installed on your system (1.0.1). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at 
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: ProtGenerics
## 
## This is MSnbase version 2.8.3 
##   Visit https://lgatto.github.io/MSnbase/ to get started.
## 
## Attaching package: 'MSnbase'
## The following object is masked from 'package:stats':
## 
##     smooth
## The following object is masked from 'package:base':
## 
##     trimws
## 
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: future
## 
## Attaching package: 'future'
## The following object is masked from 'package:S4Vectors':
## 
##     values
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomeInfoDb
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply
## 
## Attaching package: 'stageR'
## The following object is masked from 'package:methods':
## 
##     getMethod
source(paste0(wd,"/R/plot_functions.R"))

Give session info for reproducibility.

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MSqRob_0.7.6                stageR_1.3.29              
##  [3] SummarizedExperiment_1.10.1 DelayedArray_0.6.0         
##  [5] BiocParallel_1.14.2         matrixStats_0.54.0         
##  [7] GenomicRanges_1.32.3        GenomeInfoDb_1.16.0        
##  [9] IRanges_2.14.12             furrr_0.1.0.9002           
## [11] future_1.13.0               readxl_1.3.1               
## [13] colorspace_1.4-1            zoo_1.8-6                  
## [15] lme4_1.1-21                 Matrix_1.2-17              
## [17] limma_3.36.5                MSnbase_2.8.3              
## [19] ProtGenerics_1.12.0         S4Vectors_0.18.3           
## [21] mzR_2.16.1                  Rcpp_1.0.1                 
## [23] Biobase_2.40.0              BiocGenerics_0.26.0        
## [25] forcats_0.4.0               stringr_1.4.0              
## [27] dplyr_0.8.1                 purrr_0.3.2                
## [29] readr_1.3.1                 tidyr_0.8.3                
## [31] tibble_2.1.3                ggplot2_3.1.1              
## [33] tidyverse_1.2.1             MSstats_3.12.2             
## [35] usethis_1.5.0               devtools_2.0.2             
## [37] remotes_2.0.4               here_0.1                   
## 
## loaded via a namespace (and not attached):
##  [1] minqa_1.2.4            rprojroot_1.3-2        XVector_0.20.0        
##  [4] fs_1.3.1               rstudioapi_0.10        listenv_0.7.0         
##  [7] affyio_1.50.0          ggrepel_0.8.1          lubridate_1.7.4       
## [10] xml2_1.2.0             codetools_0.2-16       splines_3.6.0         
## [13] doParallel_1.0.14      ncdf4_1.16.1           impute_1.54.0         
## [16] knitr_1.23             pkgload_1.0.2          jsonlite_1.6          
## [19] nloptr_1.2.1           broom_0.5.2            vsn_3.48.1            
## [22] BiocManager_1.30.4     compiler_3.6.0         httr_1.4.0            
## [25] backports_1.1.4        assertthat_0.2.1       lazyeval_0.2.2        
## [28] cli_1.1.0              htmltools_0.3.6        prettyunits_1.0.2     
## [31] tools_3.6.0            GenomeInfoDbData_1.1.0 gtable_0.3.0          
## [34] glue_1.3.1             affy_1.58.0            reshape2_1.4.3        
## [37] MALDIquant_1.19.3      cellranger_1.1.0       gdata_2.18.0          
## [40] preprocessCore_1.42.0  nlme_3.1-140           iterators_1.0.10      
## [43] xfun_0.7               globals_0.12.4         ps_1.3.0              
## [46] testthat_2.1.1         rvest_0.3.4            gtools_3.8.1          
## [49] XML_3.98-1.20          zlibbioc_1.26.0        MASS_7.3-51.4         
## [52] scales_1.0.0           BiocInstaller_1.30.0   pcaMethods_1.72.0     
## [55] hms_0.4.2              doSNOW_1.0.16          yaml_2.2.0            
## [58] memoise_1.1.0          stringi_1.4.3          desc_1.2.0            
## [61] foreach_1.4.4          randomForest_4.6-14    caTools_1.17.1.2      
## [64] boot_1.3-22            pkgbuild_1.0.3         rlang_0.3.4           
## [67] pkgconfig_2.0.2        bitops_1.0-6           mzID_1.18.0           
## [70] evaluate_0.14          lattice_0.20-38        processx_3.3.1        
## [73] tidyselect_0.2.5       plyr_1.8.4             magrittr_1.5          
## [76] R6_2.4.0               snow_0.4-3             gplots_3.0.1.1        
## [79] generics_0.0.2         pillar_1.4.1           haven_2.1.0           
## [82] withr_2.1.2            RCurl_1.95-4.12        survival_2.44-1.1     
## [85] modelr_0.1.4           crayon_1.3.4           KernSmooth_2.23-15    
## [88] rmarkdown_1.13         grid_3.6.0             minpack.lm_1.2-1      
## [91] data.table_1.12.2      marray_1.58.0          callr_3.2.0           
## [94] digest_0.6.19          munsell_0.5.0          sessioninfo_1.1.1

Important: make sure you have run the CPTAC and HEART analyses, or load the necessary files as follows:

load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC2.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_KNN1.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_Perseus_imp.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_co_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_Hurdle.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_PI_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_KNN1_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_MSstats_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_imp_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_ProPCA_full.RData"))
# testResultOneComparison is too big and cannot be put on our GitHub page, but can be obtained by running 7_analysis_CPTAC_MSstats.Rmd
load(paste0(wd, "/save_files_CPTAC/testResultOneComparison.RData"))
load(paste0(wd, "/save_files_PXD006675/peptides_HEART2.RData"))
load(paste0(wd, "/save_files_PXD006675/res_HEART_Hurdle.RData"))
load(paste0(wd, "/save_files_PXD006675/overlap_PHM.RData"))

1. Evaluate the imputation: protein level (Supplementary Fig. 1 and 2)

We here show the sample-wise effect of kNN and Perseus imputation at the level of MaxQuant’s LFQ-values (protein level). Blue are the original values, red are the imputed values.

# Import non-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC) <- str_replace(sampleNames(proteinGroups.CPTAC), "LFQ.intensity.", "") %>% make.names

# Import Perseus-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed_imputed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC.Perseus.imp <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC.Perseus.imp) <- str_replace(sampleNames(proteinGroups.CPTAC.Perseus.imp), "LFQ.intensity.", "") %>% make.names

sample.names <- paste0(sampleNames(proteinGroups.CPTAC) %>% substr(3, 3), sampleNames(proteinGroups.CPTAC) %>% substr(5, 5))

### Impute with KNN-1
proteinGroups.CPTAC.KNN1 <- impute(proteinGroups.CPTAC, method = "knn")
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 700 rows with more than 50 % entries missing;
##  mean imputation used for these rows
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))

# 1. Histograms for the effect of Perseus imputation LFQ-level (Supplementary Fig. 1)

# grDevices::svg(paste0(wd,"/Perseus_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(proteinGroups.CPTAC.Perseus.imp))){
p1 <- hist(exprs(proteinGroups.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(LFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

# 2. Histograms for the effect of kNN imputation at LFQ-level (Supplementary Fig. 2)

# grDevices::svg(paste0(wd,"/kNN_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(proteinGroups.CPTAC.KNN1))){
p1 <- hist(exprs(proteinGroups.CPTAC.KNN1)[,i], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,i], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation sample ", sample.names[i]), xlab = "log2(LFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

2. Evaluate the imputation: peptide level (Supplementary Fig. 3 and 4)

We here show the sample-wise effect of kNN and Perseus imputation at the peptide level. Blue are the original values, red are the imputed values.

# 3. Histograms for the effect of Perseus imputation at peptide-level (Supplementary Fig. 3)

# grDevices::svg(paste0(wd,"/Perseus_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(peptides.CPTAC.Perseus.imp))){
p1 <- hist(exprs(peptides.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

# 4. Histograms for the effect of kNN imputation at peptide-level (Supplementary Fig. 4)

sample.names <- paste0(sampleNames(peptides.CPTAC.KNN1) %>% substr(3, 3), sampleNames(peptides.CPTAC.KNN1) %>% substr(5, 5))

cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))

# grDevices::svg(paste0(wd,"/kNN_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))

for(i in 1:ncol(exprs(peptides.CPTAC.KNN1))){
p1 <- hist(exprs(peptides.CPTAC.KNN1)[,i], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,i], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex)  # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE)  # second histogram
}

# dev.off()

3. Make the FDR-FTP plots (Supplementary Fig. 5)

# FDR-FTP plot including MSstats, MSstats with Inf and kNN, based on our preprocessing

res.list <- list(res.CPTAC.full,
                 res.CPTAC.co.full,
                 res.CPTAC.Hurdle,
                 res.CPTAC.KNN1.full,
                 res.CPTAC.PI.full,
                 CPTAC.Perseus.full,
                 CPTAC.Perseus.imp.full,
                 res.CPTAC.MSstats.full,
                 res.CPTAC.MSstats.full[!is.infinite(res.CPTAC.MSstats.full$log2FC),],
                 res.CPTAC.ProPCA.full
)

contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6B",
                  "condition6C-condition6A")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6B",
                  "Condition 6C vs. 6A")

colors <- c("#FF681E", "#E15E9E", "#5A2A82", "blue", "green", "#1B2944", "#42B7BA", "yellow3", "peru", "red2") # darkblue: "#1B2944"  gray: "#50FF00"

sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logOR"),
                  c("qchisq", "pchisq", "chisq", NA),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("q.value", "p.value", "Test.statistic", "Difference"),
                  c("q.value", "p.value", "Test.statistic", "Difference"),
                  c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
                  c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
                  c("adj.P.Val", "P.Value", "t", "logFC"))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) #TRUE

4. Make the FDR-FTP plots using MSstats preprocessing as a basis (Supplementary Fig. 6)

# Since in our preprocessing, 2 significant proteins were removed from the MSstats output, we also redo the whole thing with MSstats preprocessing as a reference.

res.CPTAC.MSstats <- as_tibble(testResultOneComparison$ComparisonResult)

res.CPTAC.MSstats <- res.CPTAC.MSstats %>% dplyr::rename(protein = Protein, contrast = Label)

res.CPTAC.MSstats$contrast <- res.CPTAC.MSstats$contrast %>% as.character

res.CPTAC.MSstats$contrast[res.CPTAC.MSstats$contrast == "6B-6A"] <- "condition6B-condition6A"
res.CPTAC.MSstats$contrast[res.CPTAC.MSstats$contrast == "6C-6A"] <- "condition6C-condition6A"
res.CPTAC.MSstats$contrast[res.CPTAC.MSstats$contrast == "6C-6B"] <- "condition6C-condition6B"



res.CPTAC.Base2 <- res.CPTAC.MSstats %>% select(protein, contrast)
res.CPTAC.Base2$UPS <- grepl("ups", res.CPTAC.Base2$protein)

res.CPTAC.MSstats.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.MSstats)
## Joining, by = c("protein", "contrast")
res.CPTAC.MSstats.full2 <- res.CPTAC.MSstats.full2 %>% arrange(adj.pvalue,pvalue)

res.CPTAC.MSstats.noInf.full2 <- res.CPTAC.MSstats.full2

res.CPTAC.MSstats.noInf.full2[is.infinite(res.CPTAC.MSstats.noInf.full2$log2FC),][,c("log2FC", "SE", "Tvalue", "DF", "pvalue", "adj.pvalue")] <- NA
res.CPTAC.MSstats.noInf.full2 <- res.CPTAC.MSstats.noInf.full2 %>% arrange(adj.pvalue,pvalue)

# Perseus without imputation #

# 1. Import Perseus results

results.Perseus <- read.table(paste0(wd,"/datasets/CPTAC/t-tests_Perseus_1_6_0_7-6A-6B-6C.txt"), sep="\t", header=TRUE, quote = "", comment.char = "")

results.Perseus <- results.Perseus[-c(1,2),] %>% map_dfr(~{.x %>% as.character %>% type.convert})

# 2. Convert wide to long

results.Perseus <- results.Perseus[,43:58]
colnames(results.Perseus) <- gsub("Student.s.T.test.","",colnames(results.Perseus))

res.Perseus <- results.Perseus %>% reshape(timevar = "contrast", varying =,1:12, direction = "long", sep = ".6")
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
res.Perseus$contrast[res.Perseus$contrast == "B_6A"] <- "condition6B-condition6A"
res.Perseus$contrast[res.Perseus$contrast == "C_6A"] <- "condition6C-condition6A"
res.Perseus$contrast[res.Perseus$contrast == "C_6B"] <- "condition6C-condition6B"

res.Perseus <- res.Perseus %>% dplyr::rename(protein = Majority.protein.IDs, protein.name = Protein.names, gene.name = Gene.names) %>% select(-Protein.IDs)

UPS.Perseus <- unique(res.Perseus$protein)[grepl("ups", unique(res.Perseus$protein))]
UPS.MSstats <- unique(res.CPTAC.MSstats$protein)[grepl("ups", unique(res.CPTAC.MSstats$protein))]
UPS.Hurdle <- unique(res.CPTAC.Hurdle$protein)[grepl("ups", unique(res.CPTAC.Hurdle$protein))]

UPS.MSstats[!(UPS.MSstats %in% UPS.Perseus)]
## [1] P01008ups;CON__P41361             P68871ups;CON__P02070;CON__Q3SX09
## [3] P99999ups;CON__P62894            
## 1446 Levels: A5Z2X5 D6VTK4 O00762ups O13516 O13547 O13563 O14455 ... Q99385
# P01008ups;CON__P41361 P68871ups;CON__P02070;CON__Q3SX09 P99999ups;CON__P62894
UPS.Perseus[!(UPS.Perseus %in% UPS.MSstats)]
## [1] P01008ups P68871ups P99999ups
## 1462 Levels: A5Z2X5 D6VTK4 O00762ups O13516 O13547 O13563 O14455 ... Q99385
# P01008ups P68871ups P99999ups

# -> all three of them link perfectly with each other, thus we will fill them in

CPTAC.Perseus.full2 <- res.CPTAC.Base2 %>% left_join(res.Perseus)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
CPTAC.Perseus.full2[CPTAC.Perseus.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus[res.Perseus$protein == "P01008ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]

CPTAC.Perseus.full2[CPTAC.Perseus.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus[res.Perseus$protein == "P68871ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]

CPTAC.Perseus.full2[CPTAC.Perseus.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus[res.Perseus$protein == "P99999ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]

CPTAC.Perseus.full2 <- CPTAC.Perseus.full2 %>% arrange(p.value)

# Perseus with imputation #

results.Perseus.imp <- read.table(paste0(wd,"/datasets/CPTAC/t-tests_Perseus_imputed_1_6_0_7-6A-6B-6C.txt"), sep="\t", header=TRUE, quote = "", comment.char = "")

results.Perseus.imp <- results.Perseus.imp[-c(1,2),] %>% map_dfr(~{.x %>% as.character %>% type.convert})

# 2. Convert wide to long

results.Perseus.imp <- results.Perseus.imp[,43:58]
colnames(results.Perseus.imp) <- gsub("Student.s.T.test.","",colnames(results.Perseus.imp))

res.Perseus.imp <- results.Perseus.imp %>% reshape(timevar = "contrast", varying =,1:12, direction = "long", sep = ".6")
## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.

## Warning: Setting row names on a tibble is deprecated.
res.Perseus.imp$contrast[res.Perseus.imp$contrast == "B_6A"] <- "condition6B-condition6A"
res.Perseus.imp$contrast[res.Perseus.imp$contrast == "C_6A"] <- "condition6C-condition6A"
res.Perseus.imp$contrast[res.Perseus.imp$contrast == "C_6B"] <- "condition6C-condition6B"

res.Perseus.imp <- res.Perseus.imp %>% dplyr::rename(protein = Majority.protein.IDs, protein.name = Protein.names, gene.name = Gene.names) %>% select(-Protein.IDs)

CPTAC.Perseus.imp.full2 <- res.CPTAC.Base2 %>% left_join(res.Perseus.imp)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
CPTAC.Perseus.imp.full2[CPTAC.Perseus.imp.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus.imp[res.Perseus.imp$protein == "P01008ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]

CPTAC.Perseus.imp.full2[CPTAC.Perseus.imp.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus.imp[res.Perseus.imp$protein == "P68871ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]

CPTAC.Perseus.imp.full2[CPTAC.Perseus.imp.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus.imp[res.Perseus.imp$protein == "P99999ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]

CPTAC.Perseus.imp.full2 <- CPTAC.Perseus.imp.full2 %>% arrange(p.value)

# Hurdle model #

UPS.MSstats[!(UPS.MSstats %in% UPS.Hurdle)]
## [1] P01008ups;CON__P41361             P01112ups                        
## [3] P09211ups                         P62988ups                        
## [5] P68871ups;CON__P02070;CON__Q3SX09 P99999ups;CON__P62894            
## 1446 Levels: A5Z2X5 D6VTK4 O00762ups O13516 O13547 O13563 O14455 ... Q99385
# P01008ups;CON__P41361 P01112ups P09211ups P62988ups P68871ups;CON__P02070;CON__Q3SX09 P99999ups;CON__P62894
UPS.Hurdle[!(UPS.Hurdle %in% UPS.MSstats)]
## [1] P99999ups               P01008ups               P68871ups              
## [4] P05759;P0CG63;P62988ups
## 1655 Levels:  A5Z2X5 ... Q99385
# P99999ups               P01008ups               P68871ups   P05759;P0CG63;P62988ups

# -> everything except P01112ups and P09211ups link up
# => insert the estimates of the 4 others

res.CPTAC.Hurdle2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.Hurdle)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]

res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]

res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]

res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]

res.CPTAC.Hurdle2 <- res.CPTAC.Hurdle2 %>% arrange(pchisq)

# MSqRob without preprocessing #

res.CPTAC.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.full2[res.CPTAC.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.full2[res.CPTAC.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.full2[res.CPTAC.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.full2[res.CPTAC.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.full2 <- res.CPTAC.full2 %>% arrange(pvalue)

# Quasibinomial model #

res.CPTAC.co.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.co.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.co.full2 <- res.CPTAC.co.full2 %>% arrange(pvalue)

# MSqRob with kNN imputation #

res.CPTAC.KNN1.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.KNN1.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.KNN1.full2 <- res.CPTAC.KNN1.full2 %>% arrange(pvalue)

# MSqRob with Perseus imputation #

res.CPTAC.PI.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.PI.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]

res.CPTAC.PI.full2 <- res.CPTAC.PI.full2 %>% arrange(pvalue)

# ProPCA with limma #

res.CPTAC.ProPCA.full
## # A tibble: 4,143 x 11
##    protein UPS   gene.name protein.name contrast logFC AveExpr     t
##    <chr>   <lgl> <fct>     <fct>        <chr>    <dbl>   <dbl> <dbl>
##  1 Q06830… TRUE  ""        ""           conditi… 0.552   1.91  32.9 
##  2 Q06830… TRUE  ""        ""           conditi… 0.313   1.91  18.7 
##  3 Q06830… TRUE  ""        ""           conditi… 0.238   1.91  14.2 
##  4 P12081… TRUE  ""        ""           conditi… 0.822   1.52   9.85
##  5 P10636… TRUE  ""        ""           conditi… 1.00    2.12   9.45
##  6 P00915… TRUE  ""        ""           conditi… 0.336   0.633  7.45
##  7 P12081… TRUE  ""        ""           conditi… 0.611   1.52   7.32
##  8 P02144… TRUE  ""        ""           conditi… 0.159   0.877  7.00
##  9 P10145… TRUE  ""        ""           conditi… 0.145   0.876  6.89
## 10 P55957… TRUE  ""        ""           conditi… 0.593   0.688  6.65
## # … with 4,133 more rows, and 3 more variables: P.Value <dbl>,
## #   adj.P.Val <dbl>, B <dbl>
res.CPTAC.ProPCA.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.ProPCA.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]

res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]

res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]

res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]

res.CPTAC.ProPCA.full2 <- res.CPTAC.ProPCA.full2 %>% arrange(P.Value)


### FDR-FTP plots with MSstats preprocessing as a basis (Supplementary Fig. 6) ###

res.list <- list(res.CPTAC.full2,
                 res.CPTAC.co.full2,
                 res.CPTAC.Hurdle2,
                 res.CPTAC.KNN1.full2,
                 res.CPTAC.PI.full2,
                 CPTAC.Perseus.full2,
                 CPTAC.Perseus.imp.full2,
                 res.CPTAC.MSstats.full2,
                 res.CPTAC.MSstats.full2[!is.infinite(res.CPTAC.MSstats.full2$log2FC),],
                 res.CPTAC.ProPCA.full2
)

contrast.vec <- c("condition6B-condition6A",
                  "condition6C-condition6B",
                  "condition6C-condition6A")

names(contrast.vec) <- c("Condition 6B vs. 6A",
                  "Condition 6C vs. 6B",
                  "Condition 6C vs. 6A")

colors <- c("#FF681E", "#E15E9E", "#5A2A82", "blue", "green", "#1B2944", "#42B7BA", "yellow3", "peru", "red2") # darkblue: "#1B2944"  gray: "#50FF00"

sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logOR"),
                  c("qchisq", "pchisq", "chisq", NA),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("qvalue", "pvalue", "t", "logFC"),
                  c("q.value", "p.value", "Test.statistic", "Difference"),
                  c("q.value", "p.value", "Test.statistic", "Difference"),
                  c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
                  c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
                  c("adj.P.Val", "P.Value", "t", "logFC"))

PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) #TRUE

5. Correlation between intensities and counts (Supplementary Fig. 7)

# Make the plots

# BA, CB, CA
upratio <- c(1.565597, 1.571906, 3.137504)

# 1. Top: condition 6B vs. 6A

TP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & UPS)

FP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & !UPS)

scatterHist(x = ((FP.BvsA$logOR) %>% unlist),
            y = ((FP.BvsA$logFC) %>% unlist),
            x2 = ((TP.BvsA$logOR) %>% unlist),
            y2 = ((TP.BvsA$logFC) %>% unlist), 
            main = "Condition B vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistBA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistBA.svg")

sum(is.na(((FP.BvsA$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.BvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate

# 2. Middle: condition 6C vs. 6A

TP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & UPS)

FP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & !UPS)

scatterHist(x = ((FP.CvsA$logOR) %>% unlist),
            y = ((FP.CvsA$logFC) %>% unlist),
            x2 = ((TP.CvsA$logOR) %>% unlist),
            y2 = ((TP.CvsA$logFC) %>% unlist), 
            main = "Condition C vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCA.svg")

sum(is.na(((FP.CvsA$logFC) %>% unlist)))
## [1] 158
# 158 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate

# 3. Bottom: condition 6C vs. 6B

TP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & UPS)

FP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & !UPS)

scatterHist(x = ((FP.CvsB$logOR) %>% unlist),
            y = ((FP.CvsB$logFC) %>% unlist),
            x2 = ((TP.CvsB$logOR) %>% unlist),
            y2 = ((TP.CvsB$logFC) %>% unlist), 
            main = "Condition C vs. B", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCB.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCB.svg")

sum(is.na(((FP.CvsB$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsB$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate

6. Plot example proteins for each of the overlapping regions in the Venn diagram (Supplementary Fig. 8 - 12)

Aditionally, we here provide the plots for the underlying (preprocessed) peptide-level data for the 1500 most significant proteins for each method in the HEART dataset, grouped per method.

### First 1500 most significant genes ###

# A. Select top 1500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1500]

# B. Select top 1500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1500 <- hurdle.AvsV.ov[1:1500,]$gene.name

# C. Select top 1500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1500 <- MSqRob.AvsV.ov[1:1500,]$gene.name


### Overlap all (Supplementary Fig. 8) ###

plot_proteins(gene.names = "MYL7", title = "MYL7", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "PAM", title = "PAM", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "MYBPHL", title = "MYBPHL", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "MAP1B", title = "MAP1B", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("MYL7", "PAM", "MYBPHL", "MAP1B"), title = c("overlap_all.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from all three methods
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & (genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_all.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Overlap Hurdle and Perseus (Supplementary Fig. 9) ###

plot_proteins(gene.names = "CA13", title = "CA13", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "NTN1", title = "NTN1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "PRKG2", title = "PRKG2", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "SBF2", title = "SBF2", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("CA13", "NTN1", "PRKG2", "SBF2"), title = c("overlap_hurdle_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and Perseus and against MSqRob
genes <- genes.Hurdle.AvsV.1500[(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_hurdle_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Hurdle alone (Supplementary Fig. 10) ###

plot_proteins(gene.names = "HTT", title = "HTT", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "ACACA", title = "ACACA", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "ABCA6", title = "ABCA6", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "FTCD", title = "FTCD", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("HTT", "ACACA", "ABCA6", "FTCD"), title = c("only_hurdle.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and against MSqRob and Perseus
genes <- genes.Hurdle.AvsV.1500[!(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Perseus alone (Supplementary Fig. 11) ###

plot_proteins(gene.names = "GJA5", title = "GJA5", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "ASNSD1", title = "ASNSD1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "MTCL1", title = "MTCL1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "BLOC1S5;BLOC1S5-TXNDC5", title = "BLOC1S5;BLOC1S5-TXNDC5", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("GJA5", "ASNSD1", "MTCL1", "BLOC1S5;BLOC1S5-TXNDC5"), title = c("only_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Perseus and against Hurdle and MSqRob
genes <- genes.Perseus.AvsV.1500[!(genes.Perseus.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Perseus.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Overlap MSqRob and Perseus (Supplementary Fig. 12) ###

plot_proteins(gene.names = "AK4", title = "AK4", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "NDUFV1", title = "NDUFV1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "DECR1", title = "DECR1", msnset = peptides.HEART2, pdf = FALSE)

plot_proteins(gene.names = "SDHA", title = "SDHA", msnset = peptides.HEART2, pdf = FALSE)

# plot_proteinsSVG(gene.names = c("AK4", "NDUFV1", "DECR1", "SDHA"), title = c("overlap_MSqRob_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from MSqRob and Perseus and against Hurdle
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)


### MSqRob alone ###

# Order p-values by evidence from MSqRob and against Hurdle and Perseus
genes <- genes.MSqRob.AvsV.1500[!(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_MSqRob.pdf", msnset = peptides.HEART2, pdf = TRUE)


### Overlap MSqRob and Hurdle ###

# Order p-values by evidence from MSqRob and Hurdle and against Perseus
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character

Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]

Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]

Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]

MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1

Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1

hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1

genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)

7. Generate the data for the other Venn diagrams for the HEART dataset (Supplementary Fig. 13)

### Venn diagrams atrial vs. ventricular region ###

### Checks: ###

all(Perseus.AvsV.ov$`Gene names` %in% hurdle.AvsV.ov$gene.name)
## [1] TRUE
all(Perseus.AvsV.ov$`Gene names` %in% MSqRob.AvsV.ov$gene.name)
## [1] TRUE
all(hurdle.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
all(MSqRob.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
dim(MSqRob.AvsV.ov)
## [1] 7822    9
dim(hurdle.AvsV.ov)
## [1] 7822    8
length(unique(Perseus.AvsV.ov$`Gene names`))
## [1] 7822
################

### Top: the first 500 significant genes ###

# A. Select top 500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:500]

# B. Select top 500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.500 <- hurdle.AvsV.ov[1:500,]$gene.name

# C. Select top 500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.500 <- MSqRob.AvsV.ov[1:500,]$gene.name

# MSqRob alone

sum(!(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 99
# 99

# Hurdle alone

sum(!(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 85
# 85

# Perseus alone

sum(!(genes.Perseus.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Perseus.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218

# Overlap MSqRob and Hurdle

sum((genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 158
# 158

# Overlap MSqRob and Perseus

sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 25
# 25

# Overlap Hurdle and Perseus

sum((genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500))
## [1] 39
# 39

# Overlap everything

sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & (genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218

# Control:
length(genes.MSqRob.AvsV.500)
## [1] 500
99+218+158+25
## [1] 500
length(genes.Hurdle.AvsV.500)
## [1] 500
85+158+218+39
## [1] 500
length(genes.Perseus.AvsV.500)
## [1] 500
218+39+25+218
## [1] 500
### Bottom: the first 1000 significant genes ###

# A. Select top 1000 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1000 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1000]

# B. Select top 1000 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1000 <- hurdle.AvsV.ov[1:1000,]$gene.name

# C. Select top 1000 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1000 <- MSqRob.AvsV.ov[1:1000,]$gene.name

# MSqRob alone

sum(!(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 174
# 174

# Hurdle alone

sum(!(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 134
# 134

# Perseus alone

sum(!(genes.Perseus.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Perseus.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 439
# 439

# Overlap MSqRob and Hurdle

sum((genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 363
# 363

# Overlap MSqRob and Perseus

sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 58
# 58

# Overlap Hurdle and Perseus

sum((genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000))
## [1] 98
# 98

# Overlap everything

sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & (genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 405
# 405

# Control:
length(genes.MSqRob.AvsV.1000)
## [1] 1000
174+363+58+405
## [1] 1000
length(genes.Hurdle.AvsV.1000)
## [1] 1000
134+363+98+405
## [1] 1000
length(genes.Perseus.AvsV.1000)
## [1] 1000
439+58+98+405
## [1] 1000

8. On the independence of the z-statistics under the null hypothesis (Supplementary Fig. 14)

# Create a mock experimental design where there is no differential abundance in reality
# There is however a huge "batch effect" of "condlab", the combined effects of spike-in condition and lab
pData <- pData(peptides.CPTAC2)
pData$condlab <- paste0(pData$condition,pData$lab)
pData$treat <- rep(c("one","two","three"),9)
pData$treat <- factor(pData$treat, levels = c("one", "two", "three"))

peptides.CPTAC2.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC2), fData = AnnotatedDataFrame(fData(peptides.CPTAC2)), pData = AnnotatedDataFrame(pData))

### Recreate peptides.CPTAC3 ###

# We require by default at least 2 identifications of a peptide sequence, as with 1 identification, the model will be perfectly confounded
sel <- rowSums(!is.na(Biobase::exprs(peptides.CPTAC2))) >= 2
peptides.CPTAC3 <- peptides.CPTAC2[sel]

# Again remove proteins that are only identified by one peptide
sel <- fData(peptides.CPTAC3) %>% group_by(protein) %>% mutate(flag = n() > 1) %>% pull(flag)
peptides.CPTAC3 <- peptides.CPTAC3[sel]

# Drop levels
peptides.CPTAC3 <- MSnbase::MSnSet(exprs = Biobase::exprs(peptides.CPTAC3), fData = droplevels(Biobase::fData(peptides.CPTAC3)), pData = Biobase::pData(peptides.CPTAC3))

dim(peptides.CPTAC2)
## [1] 9377   27
# 9377   27
dim(peptides.CPTAC3)
## [1] 9158   27
# 9158   27
length(unique(fData(peptides.CPTAC2)$protein))
## [1] 1381
# 1381
length(unique(fData(peptides.CPTAC3)$protein))
## [1] 1347
# 1347

rm(sel)
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb) max used   (Mb)
## Ncells 14272080 762.3   24211394 1293.1         NA 20215918 1079.7
## Vcells 51878250 395.8   81269643  620.1      16384 67657195  516.2
######

peptides.CPTAC3.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC3), fData = AnnotatedDataFrame(fData(peptides.CPTAC3)), pData = AnnotatedDataFrame(pData))

# Create contrast matrix for the mock analysis
form <- expression ~ (1|treat) + (1|condlab) + (1|sample) + (1|sequence)
contrasts <- contrast_helper(form, peptides.CPTAC3.noeff, treat)
contrasts <- contrasts[c(1,3,2),c(2,1,3)]
colnames(contrasts)[3] <- "treatthree-treattwo"
contrasts[c(2,3),3] <- c(-1,1)
contrasts
##            treattwo-treatone treatthree-treatone treatthree-treattwo
## treatone                  -1                  -1                   0
## treattwo                   1                   0                  -1
## treatthree                 0                   1                   1
# Fit the models, this might take a few minutes
# res.CPTAC.noeff <- do_mm(formula = form, msnset = peptides.CPTAC3.noeff, group_var = protein, contrasts = contrasts, type_df = "conservative", max_iter = 20)
# Or load the data
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff.RData"))

annotation.CPTAC <- tibble(protein = fData(peptides.CPTAC2)$protein, UPS = grepl("ups",fData(peptides.CPTAC2)$protein), gene.name = fData(peptides.CPTAC2)$gene.name, protein.name = fData(peptides.CPTAC2)$protein.name) %>% distinct
res.CPTAC.Base <- rbind(annotation.CPTAC, annotation.CPTAC, annotation.CPTAC)
res.CPTAC.Base$contrast <- c(rep("treattwo-treatone",nrow(annotation.CPTAC)), rep("treatthree-treatone",nrow(annotation.CPTAC)), rep("treatthree-treattwo",nrow(annotation.CPTAC)))

res.CPTAC.noeff.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff$result)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% arrange(pvalue)

prot.CPTAC.noeff.co <- do_count_groups(peptides.CPTAC2.noeff, group_var = protein, keep_fData_cols = c("gene.name","protein.name"))

# Remove estimates based on only one sample in one or more conditions from the data

not_one <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(1,25, by=3)] > 0) %>% rowSums < 2)
not_two <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(2,26, by=3)] > 0) %>% rowSums < 2)
not_three <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(3,27, by=3)] > 0) %>% rowSums < 2)

### Important: re-adjust the false discovery rate! ###

res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_one)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treatone")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_two)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_three)) & (res.CPTAC.noeff.full$contrast %in% c("treatthree-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA

res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% group_by(contrast) %>% 
  mutate(qvalue = p.adjust(pvalue, method = "BH"))

# Or load the MSqRob result object via:
# load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_full.RData"))

### Fit the quasi-binomial model ###

form.co <- ~ treat + condlab
# contrasts <- contrast_helper(~ treat, peptides.CPTAC2.noeff, treat)

# res.CPTAC.noeff.co <- do_glm(formula = form.co, msnset = prot.CPTAC.noeff.co, group_var = protein, contrasts = contrasts, contFun = "contEst")
# OR: load the model:
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_co.RData"))

res.CPTAC.noeff.co.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff.co$result)
## Joining, by = c("protein", "contrast")
res.CPTAC.noeff.co.full <- res.CPTAC.noeff.co.full %>% arrange(pvalue)

# cols <-  colorRampPalette(c("#FF3100", "#FCFF00", "#45FE4F",
#                             "#00FEFF", "#000099" 
# ))(256)

# col = cols[ceiling(rank(HEART_A_vsV$pchisq)/length(HEART_A_vsV$pchisq)*256)]

# Order results according to protein and contrast instead of p-value
res.intensities.sorted <- res.CPTAC.noeff.full %>% arrange(contrast, protein)
res.counts.sorted <- res.CPTAC.noeff.co.full %>% arrange(contrast, protein)

all(res.intensities.sorted$protein == res.counts.sorted$protein)
## [1] TRUE
all(res.intensities.sorted$contrast == res.counts.sorted$contrast)
## [1] TRUE
# library(svglite)
# svglite(file = "all_mock_correlations.svg", width = 14, height = 14)
# # svglite(file = "no_correlation_MSqRob_qbinom_21.svg", width = 7, height = 7)
# par(mar=c(5.1,6.1,4.1,2.1))
# par(mfrow = c(2,2))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treattwo-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treattwo-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 2 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")

# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
# 
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_31.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")

# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
# 
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_32.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treattwo",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treattwo",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 2", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")

# # par(mar=c(5.1,4.1,4.1,2.1))
# par(mfrow = c(1,1))
# dev.off()